library(tidyverse)
library(ggplot2)
library(plotly)
Read in the gapminder_clean.csv data with read.csv and convert it to a tibble
# Load the cleaned gapminder data and convert the data frame to a tibble.
# NOTE(review): the dotted column names renamed further down look like the
# result of read.csv()'s automatic name mangling, so readr::read_csv() is
# presumably not a drop-in replacement here.
data <- as_tibble(read.csv("gapminder_clean.csv"))
Rename some columns to avoid repeating their long, unwieldy names
# Short aliases for the long dotted column names, written as
# new_name = "old_name".
short_names <- c(
  co2_emissions = "CO2.emissions..metric.tons.per.capita.",
  population_density = "Population.density..people.per.sq..km.of.land.area.",
  imports = "Imports.of.goods.and.services....of.GDP.",
  energy_use = "Energy.use..kg.of.oil.equivalent.per.capita.",
  life_expectancy = "Life.expectancy.at.birth..total..years."
)
# any_of() silently skips a column that is not present, so a missing column
# leaves the data unchanged instead of raising an error.
data <- data %>%
  rename(any_of(short_names))
# Keep only the 1962 observations; this subset is reused by later steps.
data_on_62 <- filter(data, Year == 1962)

# Scatter plot of GDP per capita against CO2 emissions for 1962.
ggplot(data = data_on_62, mapping = aes(x = co2_emissions, y = gdpPercap)) +
  geom_point() +
  labs(
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita",
    title = "GDP per capita variation according to CO2 emissions"
  )
# Pearson correlation test between CO2 emissions and GDP per capita in 1962.
# The columns are passed as `data_on_62$...` expressions so that the "data:"
# line of the printed result keeps naming the source tibble.
cor_test_res <- cor.test(
  x = data_on_62$co2_emissions,
  y = data_on_62$gdpPercap
)
cor_test_res
##
## Pearson's product-moment correlation
##
## data: data_on_62$co2_emissions and data_on_62$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
# Year in which CO2 emissions and GDP per capita are most strongly correlated.
# Rows missing any of the four listed columns are dropped before the per-year
# Pearson correlation is computed; top_n() then keeps the year(s) with the
# largest coefficient (ties included).
highest_cor <- data %>%
  drop_na(Country.Name, Year, gdpPercap, co2_emissions) %>%
  group_by(Year) %>%
  summarize(cor = cor(co2_emissions, gdpPercap)) %>%
  top_n(n = 1, wt = cor)
summary(highest_cor)
## Year cor
## Min. :1967 Min. :0.9388
## 1st Qu.:1967 1st Qu.:0.9388
## Median :1967 Median :0.9388
## Mean :1967 Mean :0.9388
## 3rd Qu.:1967 3rd Qu.:0.9388
## Max. :1967 Max. :0.9388
# Interactive version of the 1962 scatter plot: point colour encodes the
# continent and point size encodes the population.
co2_gdp_points <- data_on_62 %>%
  select(Country.Name, Year, co2_emissions, continent, pop, gdpPercap) %>%
  drop_na()

co2_gdp_scatterplot <- ggplot(
  data = co2_gdp_points,
  mapping = aes(
    x = co2_emissions,
    y = gdpPercap,
    color = continent,
    size = pop
  )
) +
  geom_point(alpha = 0.5) +
  labs(
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita",
    title = "GDP per capita variation according to CO2 emissions"
  ) +
  scale_color_discrete(name = "Continent") +
  # An empty name suppresses the title of the size legend.
  scale_size(name = "", range = c(1, 10))

ggplotly(co2_gdp_scatterplot)
# One-way ANOVA of per-capita energy use across continents, using every year
# in the data set.
# NOTE(review): the recorded summary reports Df = 5 for continent, i.e. six
# groups; confirm that no blank/unknown continent value is being treated as a
# group of its own.
energy_continent_anova <- aov(formula = energy_use ~ continent, data = data)
summary(energy_continent_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 8.124e+08 162482656 21.88 <2e-16 ***
## Residuals 1404 1.043e+10 7426183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness
# Two-sample (Welch) t-test comparing imports of goods and services (% of GDP)
# between European and Asian countries for the years after 1990.
relevant_continents <- c("Europe", "Asia")
data_as_eu_after_90 <- data %>%
  select(Country.Name, Year, imports, continent) %>%
  filter(continent %in% relevant_continents, Year > 1990)

# Both samples must be read from the renamed `imports` column. The original
# code took the Asian sample from the pre-rename column name, which no longer
# exists; that evaluated to NULL, so t.test() silently fell back to a
# one-sample test of the European values against 0.
europe_asia_imports_t_test <- t.test(
  data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"],
  data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Asia"]
)
europe_asia_imports_t_test
## NOTE: the output previously recorded here was a one-sample t-test of the
## European values only (preceded by an "Unknown or uninitialised column"
## warning), produced by the bug described above. Re-run the script to
## regenerate the two-sample result.
# Population-density ranking: for every year, countries are ranked by
# population density (a higher density gives a higher rank) and the ranks are
# summed per country over all years.
years <- unique(data$Year)
countries <- unique(data$Country.Name[!is.na(data$Country.Name)])

# The complete-case subset does not depend on the year, so build it once.
density_data <- data %>%
  select(Country.Name, Year, population_density) %>%
  na.omit()

pop_density_ranking <- rep(0, times = length(countries))
names(pop_density_ranking) <- countries
for (x in years) {
  year_data <- density_data %>%
    filter(Year == x)
  year_ranks <- rank(year_data$population_density, na.last = TRUE)
  # Add this year's ranks to the running totals, matching on country name.
  # Assumes at most one row per country and year.
  year_countries <- as.character(year_data$Country.Name)
  pop_density_ranking[year_countries] <-
    pop_density_ranking[year_countries] + year_ranks
}

pop_density_ranking <- sort(pop_density_ranking, decreasing = TRUE)
# A total of 0 means the country never had a density value; mark it as NA.
# The mask is computed on the already-sorted vector: building it from the
# unsorted vector inside the same pipe (as before) blanked out the wrong
# positions.
pop_density_ranking[pop_density_ranking == 0] <- NA
head(pop_density_ranking)
## Macao SAR, China Monaco Hong Kong SAR, China
## 2553 2553 2537
## Singapore Gibraltar Bermuda
## 2529 2518 2506
# One line per country showing log10 population density over time, rendered
# as an interactive plotly figure.
pop_density_plot <- ggplot(
  data = data,
  mapping = aes(
    x = Year,
    y = log10(population_density),
    group = Country.Name,
    color = Country.Name,
    label = Country.Name
  )
) +
  geom_line() +
  labs(
    y = "Population density (log10(people/square km of land area))",
    x = "Year",
    title = "Population density variation according to year per country"
  )

ggplotly(pop_density_plot)
# Change in life expectancy per country between its earliest and its latest
# year in the data (latest minus earliest), sorted from the largest increase
# downwards.
life_expectancy_by_country <- data %>%
  select(Country.Name, Year, life_expectancy)

life_expectancy_diff <- vapply(
  as.character(countries),
  function(z) {
    country_life_exp <- life_expectancy_by_country %>%
      filter(Country.Name == z)
    # Locate the earliest and latest year explicitly instead of relying on
    # the row order of the data; which.min()/which.max() always return a
    # single position, even if a year is duplicated.
    first_row <- which.min(country_life_exp$Year)
    last_row <- which.max(country_life_exp$Year)
    if (length(first_row) == 0L) {
      # No usable year for this country.
      return(NA_real_)
    }
    country_life_exp$life_expectancy[last_row] -
      country_life_exp$life_expectancy[first_row]
  },
  numeric(1)
)

# sort() drops NA entries, i.e. countries whose life expectancy is missing in
# their earliest or latest year.
life_expectancy_diff <- sort(life_expectancy_diff, decreasing = TRUE)
head(life_expectancy_diff)
## Maldives Bhutan Timor-Leste Tunisia Oman Nepal
## 36.91615 33.19895 31.08515 30.86076 30.82310 30.59963
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.